Install any packages you might be missing:
library(dplyr)
library(ggplot2)
library(leaflet) # plotting spatial data
library(sf) # handle simple features objects
library(units) # set_units() for converting units objects
When you’ve downloaded and unzipped spatial_intro_data.zip, you will find these files:
vnf_ca_2018.csvpm25_ca_2018.csvpm25_sites.csvshapes <- This is a folderIt is important to to have them all in the directory where you currently keep data so that the new structure looks like this:
data folderL vnf_ca_2018.csvL pm25_ca_2018.csvL pm25_sites.csvL shapesThen copy and paste the directory path to the one below, and comment out the second line.
data.dir ="/Users/mf/Dropbox (University of Southern California)/Courses/PM569/Fall 2019/data"
Oftentimes spatial data are stored as shapefiles (.shp). Shapefiles usually come in a set of at least 4 files:
.shp: main shapefile.shx: spatial index.dbf: dBase file which stores the non-spatial columns.prj: definition of the projection of the spatial dataFor example, the shapefile ca_counties.shp and its related files consist of polygons for California counties. We read shapefiles with the st_read() function from the sf package:
ca.counties = st_read(paste0(data.dir,"/shapes","/ca_counties.shp"))
## Reading layer `ca_counties' from data source `/Users/mf/Dropbox (University of Southern California)/Courses/PM569/Fall 2019/data/shapes/ca_counties.shp' using driver `ESRI Shapefile'
## Simple feature collection with 58 features and 2 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -124.482 ymin: 32.52884 xmax: -114.1312 ymax: 42.00951
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
ca.counties
## Simple feature collection with 58 features and 2 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -124.482 ymin: 32.52884 xmax: -114.1312 ymax: 42.00951
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
## First 10 features:
## county population geometry
## 1 Alameda 1666753 MULTIPOLYGON (((-122.2809 3...
## 2 Alpine 1101 MULTIPOLYGON (((-120.0733 3...
## 3 Amador 39383 MULTIPOLYGON (((-121.0273 3...
## 4 Butte 231256 MULTIPOLYGON (((-121.8565 3...
## 5 Calaveras 45602 MULTIPOLYGON (((-120.6309 3...
## 6 Colusa 21627 MULTIPOLYGON (((-122.0802 3...
## 7 Contra Costa 1150215 MULTIPOLYGON (((-122.2677 3...
## 8 Del Norte 27828 MULTIPOLYGON (((-124.3161 4...
## 9 El Dorado 190678 MULTIPOLYGON (((-121.1186 3...
## 10 Fresno 994400 MULTIPOLYGON (((-119.7054 3...
ca.counties %>% class()
## [1] "sf" "data.frame"
An sf object is also a data.frame so most of the time typically data.frame operations work the same way. geometry is the unique column to look out for.
You can also create your own sf objects, most typically for Point data when given coordinates (longitude and latitude) of the features, using the st_as_sf() function.
vnf = read.csv(paste0(data.dir, "/vnf_ca_2018.csv")) %>%
mutate(date = as.Date(date))
head(vnf)
## vid date longitude latitude temp_bb area_bb
## 1 VNF2018010104284 2018-01-01 -120.1438 36.44304 1878 1.63433
## 2 VNF2018010107350 2018-01-01 -120.1403 36.43734 1902 2.08423
## 3 VNF2018010104283 2018-01-01 -119.9612 36.61958 1354 13.57560
## 4 VNF2018010104304 2018-01-01 -115.0597 32.98339 1117 12.99080
## 5 VNF2018010107182 2018-01-01 -119.6701 35.36865 1581 2.30227
## 6 VNF2018010104296 2018-01-01 -118.5173 34.33804 1291 4.28430
vnf = st_as_sf(vnf,
coords=c('longitude','latitude'), # coordinates columns
crs=4326, # projection
remove=TRUE) # remove coordinate columns
vnf
## Simple feature collection with 10022 features and 4 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: -124.008 ymin: 32.59483 xmax: -114.2726 ymax: 42.00755
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
## First 10 features:
## vid date temp_bb area_bb geometry
## 1 VNF2018010104284 2018-01-01 1878 1.63433 POINT (-120.1438 36.44304)
## 2 VNF2018010107350 2018-01-01 1902 2.08423 POINT (-120.1403 36.43734)
## 3 VNF2018010104283 2018-01-01 1354 13.57560 POINT (-119.9612 36.61958)
## 4 VNF2018010104304 2018-01-01 1117 12.99080 POINT (-115.0597 32.98339)
## 5 VNF2018010107182 2018-01-01 1581 2.30227 POINT (-119.6701 35.36865)
## 6 VNF2018010104296 2018-01-01 1291 4.28430 POINT (-118.5173 34.33804)
## 7 VNF2018010104302 2018-01-01 1182 8.58294 POINT (-117.8234 33.61306)
## 8 VNF2018010104303 2018-01-01 1058 24.28530 POINT (-116.1025 33.0821)
## 9 VNF2018010107181 2018-01-01 1292 8.23556 POINT (-119.4799 35.88723)
## 10 VNF2018010104294 2018-01-01 1105 14.96500 POINT (-119.4011 34.59208)
vnf %>%
filter(substr(date,6,7) %>% as.numeric() == 7) %>%
leaflet() %>%
addProviderTiles('Wikimedia') %>%
addCircles(color='firebrick', fillOpacity=1, opacity=1, radius=500)
metro.rails = st_read(paste0(data.dir, '/shapes/metro_lines.shp'))
## Reading layer `metro_lines' from data source `/Users/mf/Dropbox (University of Southern California)/Courses/PM569/Fall 2019/data/shapes/metro_lines.shp' using driver `ESRI Shapefile'
## Simple feature collection with 480 features and 1 field
## geometry type: MULTILINESTRING
## dimension: XY
## bbox: xmin: -118.4917 ymin: 33.768 xmax: -117.8899 ymax: 34.17085
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
line.pal = colorFactor(c('blue','cornflowerblue','darkgoldenrod','darkgreen','firebrick'),
domain=metro.rails$line)
metro.rails %>%
leaflet() %>%
addProviderTiles('Wikimedia') %>%
addPolylines(color=~line.pal(line), label=~line, opacity=.9)
Los Angeles county is an example of a multipolygon.
ca.counties %>%
leaflet() %>%
addProviderTiles('Wikimedia') %>%
addPolygons(weight=1, fillOpacity=.25, label=~county)
Typically come in Point geometries. Any place in the region has a value (e.g., PM2.5 level), but the data is a sample of a few points (e.g., PM2.5 stations).
pm25 = read.csv(paste0(data.dir, "/pm25_ca_2018.csv"))
pm25.sites = read.csv(paste0(data.dir, "/pm25_sites.csv"))
pm_ca<-merge(pm25 %>% filter(date=="11/10/18"), pm25.sites, by='site_id')
pm25.pal = colorNumeric(c('darkgreen','goldenrod','brown','brown','brown'),
domain=pm_ca$pm25)
leaflet(pm_ca) %>%
addProviderTiles('CartoDB.Positron') %>%
addCircles(label=~paste0(pm25, ' ug/m3'), color=~pm25.pal(pm25),
opacity=1, fillOpacity=1, radius=500) %>%
addLegend('bottomleft', pal=pm25.pal, values=pm_ca$pm25,
title='PM2.5 (ug/m3)', opacity=1)
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
Typically come in Polygon geometries. They aggregate/summarize values (e.g., population, income) over geographic divisions (e.g., countries, states, counties, census blocks, etc.).
pop.pal = colorNumeric(c('beige','brown','brown'),
domain=ca.counties$population)
ca.counties %>%
leaflet() %>%
addProviderTiles('CartoDB.Positron') %>%
addPolygons(fillColor=~pop.pal(population), weight=.5, fillOpacity=.9,
label=~paste0(county, ': ', population))
Typically come in Point geometries. Each point is an occurance of an event (e.g., fire, crime, etc.); no point means nothing happened.
temp.pal = colorNumeric(c('brown','brown','beige'), domain=vnf$temp_bb)
vnf %>%
filter(substr(date,6,7) %>% as.numeric() == 7) %>%
arrange(temp_bb) %>% # to show hottest spots more prominently
leaflet() %>%
addProviderTiles('CartoDB.DarkMatter') %>%
addCircles(label=~paste0(temp_bb,' K'), color=~temp.pal(temp_bb),
fillOpacity=.8, opacity=.8, radius=500) %>%
addLegend('bottomleft', pal=temp.pal, values=vnf$temp_bb,
title='Fire Temp.(K)', opacity=1)
Typically, spatial data are projected from the earth (3D) to a map (2D), but since the earth is not a sphere, no projection system is perfect (relevant XKCD). There are global projections (e.g., WGS84, NAD83) and local projections (e.g, UTM11), each in different units (e.g., degrees for WGS84, meters for UTM11). This affects spatial operations such as calculating distance (st_distance()), area (st_area()), etc. Operations between spatial (sf) objects also require them to be in the same projection.
There are several ways to define the projection, but the easiest is using the Coordinates Reference System, which assigned a number (SRID) to most common projections. (Try Googling “WGS84 srid” or “WGS84 crs”) A few common CRS:
tryCatch({
st_contains(ca.counties %>% st_transform(crs=4269),
vnf)
}, error = function(e) {
print(e)
}
)
## <simpleError in st_geos_binop("contains", x, y, sparse = sparse, prepared = prepared): st_crs(x) == st_crs(y) is not TRUE>
st_transform() re-project the spatial data to a different projection. This is useful for when the operations require a different unit (e.g., meters instead of degrees); also useful for when you when you receive data in custom projections that you want to have in more standard projections.
We can see the result of re-projection in the geometry column:
vnf
## Simple feature collection with 10022 features and 4 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: -124.008 ymin: 32.59483 xmax: -114.2726 ymax: 42.00755
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
## First 10 features:
## vid date temp_bb area_bb geometry
## 1 VNF2018010104284 2018-01-01 1878 1.63433 POINT (-120.1438 36.44304)
## 2 VNF2018010107350 2018-01-01 1902 2.08423 POINT (-120.1403 36.43734)
## 3 VNF2018010104283 2018-01-01 1354 13.57560 POINT (-119.9612 36.61958)
## 4 VNF2018010104304 2018-01-01 1117 12.99080 POINT (-115.0597 32.98339)
## 5 VNF2018010107182 2018-01-01 1581 2.30227 POINT (-119.6701 35.36865)
## 6 VNF2018010104296 2018-01-01 1291 4.28430 POINT (-118.5173 34.33804)
## 7 VNF2018010104302 2018-01-01 1182 8.58294 POINT (-117.8234 33.61306)
## 8 VNF2018010104303 2018-01-01 1058 24.28530 POINT (-116.1025 33.0821)
## 9 VNF2018010107181 2018-01-01 1292 8.23556 POINT (-119.4799 35.88723)
## 10 VNF2018010104294 2018-01-01 1105 14.96500 POINT (-119.4011 34.59208)
vnf %>% st_transform(crs=32611)
## Simple feature collection with 10022 features and 4 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: -94089.09 ymin: 3606371 xmax: 750819.2 ymax: 4670569
## epsg (SRID): 32611
## proj4string: +proj=utm +zone=11 +datum=WGS84 +units=m +no_defs
## First 10 features:
## vid date temp_bb area_bb geometry
## 1 VNF2018010104284 2018-01-01 1878 1.63433 POINT (218206.2 4037685)
## 2 VNF2018010107350 2018-01-01 1902 2.08423 POINT (218500.3 4037043)
## 3 VNF2018010104283 2018-01-01 1354 13.57560 POINT (235179.9 4056757)
## 4 VNF2018010104304 2018-01-01 1117 12.99080 POINT (681305.8 3651117)
## 5 VNF2018010107182 2018-01-01 1581 2.30227 POINT (257417.1 3917199)
## 6 VNF2018010104296 2018-01-01 1291 4.28430 POINT (360430.4 3800681)
## 7 VNF2018010104302 2018-01-01 1182 8.58294 POINT (423614.4 3719558)
## 8 VNF2018010104303 2018-01-01 1058 24.28530 POINT (583765 3660747)
## 9 VNF2018010107181 2018-01-01 1292 8.23556 POINT (276156.5 3974281)
## 10 VNF2018010104294 2018-01-01 1105 14.96500 POINT (279794.3 3830428)
Below we calculate the area of the first 6 counties in ca.counties. st_area() is smart enough to return results in meters-squared in both cases, but when given coordinates in degrees, it will calculate based on an ellipsoidal earth, which is not as accurate as local projections (e.g., UTM11).
wgs84.area = ca.counties %>%
st_area() %>% set_units(km^2)
utm11.area = ca.counties %>%
st_transform(crs=32611) %>%
st_area() %>% set_units(km^2)
utm10.area = ca.counties %>%
st_transform(crs=32610) %>%
st_area() %>% set_units(km^2)
tibble(county=ca.counties$county, wgs84.area, utm11.area, utm10.area)
## # A tibble: 58 x 4
## county wgs84.area utm11.area utm10.area
## <fct> <[km^2]> <[km^2]> <[km^2]>
## 1 Alameda 2127.223 km^2 2135.420 km^2 2126.020 km^2
## 2 Alpine 1924.850 km^2 1926.178 km^2 1926.953 km^2
## 3 Amador 1569.405 km^2 1572.102 km^2 1569.797 km^2
## 4 Butte 4343.750 km^2 4356.986 km^2 4341.850 km^2
## 5 Calaveras 2685.627 km^2 2689.914 km^2 2686.533 km^2
## 6 Colusa 2994.955 km^2 3007.702 km^2 2992.906 km^2
## 7 Contra Costa 2081.750 km^2 2089.833 km^2 2080.542 km^2
## 8 Del Norte 3184.861 km^2 3208.644 km^2 3182.833 km^2
## 9 El Dorado 4626.620 km^2 4633.697 km^2 4628.272 km^2
## 10 Fresno 15568.555 km^2 15579.028 km^2 15591.683 km^2
## # … with 48 more rows
plot(data.frame(utm11.area, differences=(wgs84.area - utm11.area)))